# 01_sample_descriptives.R
# Generate descriptive summaries of housing authorities in sample 
# Produces:
# - Main Table 2: Characteristics of Housing Authorities in Sample versus 
#                 All Voucher-Administering Housing Authorities by Quartile
# - Table S2 - Characteristics of housing authorities in sample by data source

library(tidyverse)
library(xtable)
library(here)
options(scipen=999)


# Load data on PHAs in sample
pha_pref_ref <- readRDS(here("data", "pha_pref_ref.RDS"))

# Load data on PHA characteristics
pha_all <-  readRDS(here("data", "pha_all.RDS"))

# Join PHA characteristics with in-sample status
phas <- left_join(pha_pref_ref, pha_all, by = "pha_code")

###########################################################################
## Generate Main Table 2: Characteristics of Housing Authorities in Sample 
## versus All Voucher-Administering Housing Authorities by Quartile
###########################################################################

# Summarize PHA attributes by quartile for continuous variables
# Arguments: samp_df -  observations in selected sample
#            set_name - label for sample
generate_desc_contin <- function(samp_df, set_name) {
  descriptives_init <- samp_df %>%
    transmute(hud_med_month_vouchers, 
              hud_count_all_units,
              derived_acs_not_hispanic_or_latinowhite_alone_percent, 
              derived_acs_not_hispanic_or_latinoblack_or_african_american_alone_percent, 
              derived_acs_not_hispanic_or_latinoasian_alone_percent,
              derived_acs_hisp_any_perc, derived_acs_veteran_percent, 
              derived_acs_below_100_percent_of_the_poverty_level_percent, 
              derived_acs_in_the_labor_forceunemployed_percent,
              derived_acs_living_in_household_with_supplemental_security_income_ssi_cash_public_assistance_income_or_food_stampssnap_in_the_past_12_months_percent,
              derived_median_hh_income, derived_median_rental_burden,
              derived_acs_renter_occupied_percent, derived_acs_vacant_percent,
              perc_tracts_urban, 
              urban_affordable_noassist_per100, 
              urban_affordable_per100, 
              sharkey_commorg_densper_1000_inpov, 
              human_services_densper_1000_inpov, 
              repub_2016) %>%
    summarize_all(list(~quantile(., 0.25, na.rm = TRUE), 
                       ~quantile(., 0.5, na.rm = TRUE), 
                       ~quantile(., 0.75, na.rm = TRUE)))
  
  # Reshape output 
  descriptives <- descriptives_init %>%
    pivot_longer(everything(), names_to = "variable") %>%
    mutate(variable = str_remove(variable, "_quantile")) %>%
    separate(variable, into = c("variable", "quartile"), sep = "\\.\\.") %>%
    # Convert select variables to percent
    mutate_at(vars(starts_with("value")), 
              list(~ifelse(str_detect(variable, "_percent$|^repub|perc_|_perc"),
                                             . * 100, .))) %>%
    mutate(quartile = paste0("q", quartile, "_", set_name)) %>%
    pivot_wider(names_from = quartile, values_from = value) 

  return(descriptives)
}

# Generate descriptives for full universe and PHAs in sample
# for continuous variables
desc_pop_cont <- generate_desc_contin(phas, "full")
desc_sample_cont <- generate_desc_contin(filter(phas, in_sample_abt == TRUE), 
                                         "sample") 

# Bind descriptives for continuous variables together
descriptives_cont_joined <- desc_sample_cont %>%
  left_join(desc_pop_cont, by = "variable") 

colnames(descriptives_cont_joined) <- c("Variable", "Q1 - PHAs in Sample", 
                                        "Q2 - PHAs in Sample", 
                                        "Q3 - PHAs in Sample",
                                        "Q1 - All PHAs",  "Q2 - All PHAs", 
                                        "Q3 - All PHAs")

var_row_labels <- c("Vouchers administered", 
                    "Total units served",
                    "% White alone", "% Black alone", "% Asian alone", 
                    "% Hispanic",
                    "% Veteran", "% Below poverty", 
                    "% Unemployed", "% Receiving SSI/TANF/SNAP", 
                    "Median household income", "Median rent burden",
                    "% Units renter-occupied", "% Units vacant",
                    "% Census tracts in metropolitan area",
                    "# Affordable per 100, without assistance", 
                    "# Affordable per 100, with assistance",
                    "# Community organizations per 1,000", 
                    "# Human services organizations per 1,000", 
                    "% Republican vote, 2016")

descriptives_cont_joined <- add_column(descriptives_cont_joined,
                                       variable_lab = var_row_labels,
                                       .before = "Variable")
                                       

# Summarize PHA attributes by quartile for categorical/binary variables
# Arguments: samp_df -  observations in selected sample
#            set_name - label for sample
generate_desc_cat <- function(samp_df, set_name) {
  # Proportion in each Census region 
  descriptives_1 <- samp_df %>%
    group_by(census_region) %>%
    summarize(count = n()) %>%
    ungroup() %>%
    mutate(prop = (count / sum(count)) * 100) %>%
    select(census_region, prop) %>%
    rename(variable = census_region,
           value = prop)
  
  # Proportion with local CoC
  descriptives_2 <- samp_df %>%
    group_by(coc_category_noimpute) %>%
    summarize(count = n()) %>%
    ungroup() %>%
    mutate(prop = (count / sum(count)) * 100) %>%
    select(coc_category_noimpute, prop) %>%
    rename(variable = coc_category_noimpute,
           value = prop)
  
  descriptives <- bind_rows(descriptives_1, descriptives_2) %>%
    filter(!is.na(variable))
  
  colnames(descriptives)[2] <- paste0("value_", set_name)
  return(descriptives)
}

# Generate descriptives for full universe and PHAs in sample
# for categorical variables
desc_pop_cat <- generate_desc_cat(phas, "full")
desc_sample_cat <- generate_desc_cat(filter(phas, in_sample_abt == TRUE ), "sample") 

# Join categorical descriptives together
descriptives_cat_joined <- desc_sample_cat %>%
  left_join(desc_pop_cat, by = "variable") 

colnames(descriptives_cat_joined) <- c("Variable", "PHAs in Sample", "All PHAs")

# Output table 
print(xtable(descriptives_cont_joined[,-2], 
             digits = 0, align = "llrrrrrr", 
             caption = "Characteristics of housing authorities in sample vs all voucher-administering housing authorities"), 
      comment = F, table.placement = "!htpb",
      include.rownames = FALSE)

print(xtable(descriptives_cat_joined, 
             digits = 0, align = "llrr", 
             caption = "Characteristics of housing authorities in sample vs all voucher-administering housing authorities"), 
      comment = F, table.placement = "!htpb",
      include.rownames = FALSE)

######################################################################
## Generate Table S2 
## Characteristics of housing authorities in sample by data source
######################################################################

# Clean up data source info
source_info_all <- phas %>% 
  filter(in_sample_abt == 1) %>%
  mutate(source = case_when(in_sample == TRUE & from_plan == TRUE ~ "plan",
                            in_sample == TRUE & from_email == 1 ~ "email",
                            in_sample == TRUE & from_website == 1 ~ "website",
                            in_sample == FALSE & in_sample_abt == TRUE ~ "abt")) 

# Get count and proportion by data source
source_info_all %>%
  group_by(source) %>%
  summarize(count = n()) %>%
  mutate(prop = count / sum(count))

# Summarize sample composition by data source
desc_sample_plan_cont <- generate_desc_contin(filter(source_info_all, 
                                                     in_sample_abt == TRUE & source == "plan"), "plan") 
desc_sample_abt_cont <- generate_desc_contin(filter(source_info_all, 
                                                    in_sample_abt == TRUE & source == "abt"), "abt") 
desc_sample_web_cont <- generate_desc_contin(filter(source_info_all, 
                                                    in_sample_abt == TRUE & source %in% c("website", "email")), "web") 

desc_sample_sources_cont <- left_join(desc_sample_plan_cont, 
                                      desc_sample_web_cont, by = "variable") %>%
  left_join(desc_sample_abt_cont, by = "variable") %>%
  # Grab the medians only
  select(variable, starts_with("q2_"))


colnames(desc_sample_sources_cont) <- c("Variable", "Plans", 
                                                  "Web", "Abt")
desc_sample_sources_cont <- add_column(desc_sample_sources_cont,
                                       variable_lab = var_row_labels,
                                       .before = "Variable")

desc_sample_plan_cat <- generate_desc_cat(filter(source_info_all, 
                                                 in_sample_abt == TRUE & source == "plan"), "plan") 
desc_sample_abt_cat <- generate_desc_cat(filter(source_info_all, 
                                                in_sample_abt == TRUE & source == "abt"), "abt") 
desc_sample_web_cat <- generate_desc_cat(filter(source_info_all, 
                                                in_sample_abt == TRUE & source %in% c("website", "email")), "web") 

desc_sample_sources_cat <- left_join(desc_sample_plan_cat, 
                                     desc_sample_web_cat, by = "variable") %>%
  left_join( desc_sample_abt_cat, by = "variable")

# Output 
print(xtable(desc_sample_sources_cont[,-2],
             digits = 0, align = "llrrr",  
             caption = "Characteristics of housing authorities in sample by data source"), 
      comment = F, table.placement = "!htpb",
      include.rownames = FALSE)

print(xtable(desc_sample_sources_cat, 
             digits = 0, align = "llrrr", 
             caption = "Characteristics of housing authorities in sample by data source"), 
      comment = F, table.placement = "!htpb",
      include.rownames = FALSE)


##################################
## Voucher population weighting ##
##################################
# Calculate the proportion of all vouchers covered by the analytic sample 
sum(phas$hud_med_month_vouchers[phas$in_sample_abt == TRUE]) / 
  sum(phas$hud_med_month_vouchers)

